library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library(SeuratData)
## ── Installed datasets ──────────────────────────────── SeuratData v0.2.2.9001 ──
## ✔ ifnb    3.1.0                         ✔ pbmc3k  3.1.4
## ✔ lungref 2.0.0
## ────────────────────────────────────── Key ─────────────────────────────────────
## ✔ Dataset loaded successfully
## ❯ Dataset built with a newer version of Seurat than installed
## ❓ Unknown version of Seurat installed
library(patchwork)
library(Azimuth)
## Registered S3 method overwritten by 'SeuratDisk':
##   method            from  
##   as.sparse.H5Group Seurat
## 
## Attaching shinyBS
library(sctransform)
library(ggplot2)
library(hdf5r)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reticulate)
library(DESeq2) # if want to perform DE using this option
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:hdf5r':
## 
##     values
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:sp':
## 
##     %over%
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
## 
##     Assays
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## 
## Attaching package: 'DESeq2'
## The following object is masked from 'package:sctransform':
## 
##     vst
library(reticulate)
library(BPCells)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%()    masks IRanges::%within%()
## ✖ IRanges::collapse()      masks dplyr::collapse()
## ✖ Biobase::combine()       masks BiocGenerics::combine(), dplyr::combine()
## ✖ matrixStats::count()     masks dplyr::count()
## ✖ IRanges::desc()          masks dplyr::desc()
## ✖ tidyr::expand()          masks S4Vectors::expand()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ S4Vectors::first()       masks dplyr::first()
## ✖ purrr::flatten_df()      masks hdf5r::flatten_df()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
## ✖ purrr::reduce()          masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ S4Vectors::rename()      masks dplyr::rename()
## ✖ lubridate::second()      masks S4Vectors::second()
## ✖ lubridate::second<-()    masks S4Vectors::second<-()
## ✖ IRanges::slice()         masks dplyr::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /vf/users/pengl7/IRF/sc-20250318/seurat_HDAC11_mouse_bladder
use_condaenv("/data/pengl7/conda/envs/r4seurat")
py_config()
## python:         /vf/users/pengl7/conda/envs/r4seurat/bin/python
## libpython:      /data/pengl7/conda/envs/r4seurat/lib/libpython3.11.so
## pythonhome:     /vf/users/pengl7/conda/envs/r4seurat:/vf/users/pengl7/conda/envs/r4seurat
## version:        3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:34:09) [GCC 12.3.0]
## numpy:          /vf/users/pengl7/conda/envs/r4seurat/lib/python3.11/site-packages/numpy
## numpy_version:  1.25.2
## 
## NOTE: Python version was forced by use_python() function
# need to first create a conda env as r4seurat; conda active r4seurat; conda install leidenalg; conda install numpy pandas; start rstudio&
leidenalg <- import("leidenalg")
merged <- readRDS(here("results", "merged_filtered_percentile_filtered.rds"))
merged
## An object of class Seurat 
## 23319 features across 30252 samples within 1 assay 
## Active assay: RNA (23319 features, 0 variable features)
##  4 layers present: counts.1KO, counts.1WT, counts.2KO, counts.2WT

Each sample has about 7K cells after initial data filtering

head(merged@meta.data)
##                        orig.ident nCount_RNA nFeature_RNA sample percent.mt
## 1KO_AAACCAAAGGCGATAG-1        1KO        578          377    1KO  0.1730104
## 1KO_AAACCAAAGTTGCTGA-1        1KO       2947         1669    1KO  6.4472345
## 1KO_AAACCCGCAAGCGCGT-1        1KO       1745         1071    1KO  0.6303725
## 1KO_AAACCCTGTTCGCACA-1        1KO       3828         2071    1KO  0.4963427
## 1KO_AAACCCTGTTGGACCC-1        1KO       6814         3087    1KO  0.1174053
## 1KO_AAACCGCTCAGCCATA-1        1KO      10135         3324    1KO  0.1282684
table(merged$orig.ident)  # or check other metadata columns like merged$condition
## 
##  1KO  1WT  2KO  2WT 
## 7187 8334 7095 7636

add meta inforjmation: condition, replicate, batch

merged$condition <- ifelse(grepl("WT", merged$orig.ident), "WT", "KO")
merged$replicate <- ifelse(grepl("1", merged$orig.ident), "1", "2")
head(merged$condition)
## 1KO_AAACCAAAGGCGATAG-1 1KO_AAACCAAAGTTGCTGA-1 1KO_AAACCCGCAAGCGCGT-1 
##                   "KO"                   "KO"                   "KO" 
## 1KO_AAACCCTGTTCGCACA-1 1KO_AAACCCTGTTGGACCC-1 1KO_AAACCGCTCAGCCATA-1 
##                   "KO"                   "KO"                   "KO"

Basic QC plots

  1. Before batch correction, check for potential variation across samples.
VlnPlot(merged, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), group.by = "orig.ident", pt.size = 0.1)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

FeatureScatter

nCount_RNA VS nFeature_RNA A strong correlation here is typical. Outliers (too many counts but few features) might be doublets or dead cells.

FeatureScatter(merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

Normalize and Identify Variable Genes

merged <- NormalizeData(merged)
## Normalizing layer: counts.1KO
## Normalizing layer: counts.1WT
## Normalizing layer: counts.2KO
## Normalizing layer: counts.2WT
merged <- FindVariableFeatures(merged, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts.1KO
## Finding variable features for layer counts.1WT
## Finding variable features for layer counts.2KO
## Finding variable features for layer counts.2WT
  1. PCA, UMAP & Clustering This is still pre-batch correction, just to explore structure:
merged <- ScaleData(merged, vars.to.regress = "percent.mt")  # Optional regression
## Regressing out percent.mt
## Centering and scaling data matrix
merged <- RunPCA(merged)
## PC_ 1 
## Positive:  Sh3gl2, Ankfn1, Mecom, Trp63, Ctnnd2, Ttc6, Pkhd1, 9530036O11Rik, Gm20528, Gata3 
##     Dock8, 2610035D17Rik, Tmprss2, Rab27b, Foxq1, Fmo5, Mir205hg, Crybg1, Gm49890, Gpr39 
##     Paqr5, Gm13219, Ugt1a6a, Rbm47, Ikzf2, Fer1l4, AC106834.1, Syt16, Tiam1, D730045B01Rik 
## Negative:  Bnc2, Cacnb2, Slc8a1, Slit3, Arhgap6, Carmn, Nrp2, Plcl1, Cacna1c, Sgcd 
##     Adam33, Ror1, Zfpm2, Synpo2, Itga1, Kcnq5, Adgrd1, Sdk1, Magi2, Lsamp 
##     Fbxl7, Pgm5, Fam129a, Dmd, Pdlim3, Myom1, Filip1, Pdzrn3, Sorbs1, Col25a1 
## PC_ 2 
## Positive:  Carmn, Col25a1, Myom1, Gm11946, Sntg2, Chrm3, Ryr3, Dgki, Hcn1, Kcnma1 
##     Synpo2, Rbfox3, Sorbs1, Pdlim3, Lrrc7, Myocd, Slc24a3, P2rx1, Dmd, Fhod3 
##     Cacna1c, Dtna, Art3, Itga1, Stac, Cap2, Akap6, Cacna2d3, Kcnn2, Dock3 
## Negative:  Zfp521, Pid1, Sox5, Col14a1, Xdh, Abi3bp, Lama2, Kcnt2, Cdk14, Plxdc2 
##     Gas7, Ebf1, Bicc1, Cfh, Cped1, Pdgfra, Cd55, Grk5, Fndc1, Man1a 
##     Gxylt2, Cgnl1, Apbb1ip, Fbln1, Dab2, Cntn4, Dcn, Cd34, Il1rapl1, Fbn1 
## PC_ 3 
## Positive:  Gpc6, Bicc1, Sox5, Adamtsl1, Cntn4, Col14a1, Dlc1, Lama2, Zfpm2, Tenm3 
##     Meg3, Cd55, Abi3bp, Adamtsl3, Ghr, Pdgfra, Col3a1, Col5a2, Pdzrn3, Fbln1 
##     Gxylt2, Sdk1, Kcnt2, Plxdc2, Magi2, Cdh11, Bnc2, Fndc1, Il1rapl1, Adamts2 
## Negative:  Dock2, Ptprc, Inpp5d, Arhgap15, Gm26740, Myo1f, March1, Ikzf1, Nrros, Ly86 
##     Cyth4, Epsti1, Mrc1, Tbxas1, Adgre1, F13a1, Arhgap45, F630028O10Rik, Cd84, Fli1 
##     St8sia4, Aoah, Mir142hg, Mctp1, Slc9a9, Arhgap30, Pirb, Pik3ap1, Tm6sf1, Lilrb4a 
## PC_ 4 
## Positive:  Adgrf5, Egfl7, Cyyr1, Ptprb, Pecam1, Emcn, Adgrl4, Erg, Ldb2, Rasgrf2 
##     Ushbp1, Dipk2b, Shank3, Flt1, Cdh5, St6galnac3, Kank3, Sncaip, Sema6a, Abcb1a 
##     Tll1, Vwf, Kdr, Palmd, Ablim3, Prkch, Podxl, Abcg2, Sox17, Tek 
## Negative:  Pid1, Zeb2, Apbb1ip, Arhgap15, Dock2, Ptprc, Slc9a9, March1, Gm26740, Myo1f 
##     Dock10, Arhgap45, Gpc6, Ly86, Adgre1, F13a1, Tbxas1, F630028O10Rik, Mrc1, Cntn4 
##     Rnf150, Cyth4, Epsti1, Abca9, Lama2, Ikzf1, Aoah, Ifi207, Cd84, Mitf 
## PC_ 5 
## Positive:  Ebf1, Prex2, Cped1, Fgd5, Lama2, Ptprb, Tek, Pdgfra, Jam2, Cntn4 
##     Pecam1, Cyyr1, Egfl7, Flt1, Hspa12b, Lhfpl2, Rnf144a, Dnm3, Emcn, Gsn 
##     Pi16, Calcrl, Adgrl4, Erg, Myrip, Cxcl12, Adam12, Ldb2, Scara5, Ntn1 
## Negative:  Muc16, Pcnx2, Wt1, Rspo1, Wt1os, Kcnd2, Gpm6a, Wdr17, Bnc1, Pkhd1l1 
##     Dpp4, Ptprq, Asic2, Pcdh15, Fgf1, Ddo, Msln, Cybrd1, Ptger3, Sntg1 
##     Slc39a8, Smpd3, Atp6v0a4, Nxph1, Prss12, Lrrn4, Il16, Col4a4, Dnajc6, Cldn15

check the difference by choosing pcs 30 and 50

ElbowPlot(merged, ndims = 50)

merged <- RunUMAP(merged, dims = 1:20)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:02:14 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:14 Read 30252 rows and found 20 numeric columns
## 14:02:14 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:14 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:02:16 Writing NN index file to temp file /lscratch/52602149/RtmpRKXtoc/file1ba6c12daf3cb0
## 14:02:16 Searching Annoy index using 1 thread, search_k = 3000
## 14:02:24 Annoy recall = 100%
## 14:02:25 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:02:27 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:02:28 Commencing optimization for 200 epochs, with 1351298 positive edges
## 14:02:38 Optimization finished
DimPlot(merged, label = TRUE)

merged <- RunUMAP(merged, dims = 1:30)
## 14:02:39 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:39 Read 30252 rows and found 30 numeric columns
## 14:02:39 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:02:39 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:02:41 Writing NN index file to temp file /lscratch/52602149/RtmpRKXtoc/file1ba6c139b9fecf
## 14:02:41 Searching Annoy index using 1 thread, search_k = 3000
## 14:02:48 Annoy recall = 100%
## 14:02:49 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:02:51 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:02:53 Commencing optimization for 200 epochs, with 1364094 positive edges
## 14:03:03 Optimization finished
DimPlot(merged, label = TRUE)

# chose PCs=30 to perform clustering

merged <- RunUMAP(merged, dims = 1:30)
## 14:03:04 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:03:04 Read 30252 rows and found 30 numeric columns
## 14:03:04 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## 14:03:04 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:03:06 Writing NN index file to temp file /lscratch/52602149/RtmpRKXtoc/file1ba6c175f67d64
## 14:03:06 Searching Annoy index using 1 thread, search_k = 3000
## 14:03:13 Annoy recall = 100%
## 14:03:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:03:17 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:03:18 Commencing optimization for 200 epochs, with 1364094 positive edges
## 14:03:28 Optimization finished
merged <- FindNeighbors(merged, dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
merged <- FindClusters(merged, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 30252
## Number of edges: 1164289
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9400
## Number of communities: 20
## Elapsed time: 5 seconds

Visualize clusters:

DimPlot(merged, group.by = "seurat_clusters", label = TRUE)

merged <- FindClusters(merged, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 30252
## Number of edges: 1164289
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9480
## Number of communities: 18
## Elapsed time: 4 seconds
DimPlot(merged, group.by = "seurat_clusters", label = TRUE)

merged <- FindClusters(merged, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 30252
## Number of edges: 1164289
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9578
## Number of communities: 15
## Elapsed time: 5 seconds
DimPlot(merged, group.by = "seurat_clusters", label = TRUE)

#install.packages("clustree")
library(clustree)
## Loading required package: ggraph
## 
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
## 
##     geometry
clustree(merged, prefix = "RNA_snn_res.")

Assess cluster quality of r0.2

Need to join layers before performing FindAllMarkers since I merged 4 samples together which keep their indiviaul data and count slots

merged <- JoinLayers(
  object = merged,
  assay = "RNA",
  slot = "data",                     # or "counts" if you're starting from raw counts
  new.layer = "joined_normalized"    # name for the new combined layer
)

Before JoinLayer 4 layers present: counts.1KO, counts.1WT, counts.2KO, counts.2WT After JoinLayer: 3 layers present: data, counts, scale.data

merged
## An object of class Seurat 
## 23319 features across 30252 samples within 1 assay 
## Active assay: RNA (23319 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  2 dimensional reductions calculated: pca, umap

Use marker genes to check whether these clusters represent distinct biological identities:

Idents(merged) <- "seurat_clusters"
markers <- FindAllMarkers(
  merged,
  only.pos = TRUE,
  min.pct = 0.1,
  logfc.threshold = 0.1
)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
top10 <- markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(merged, features = top10$gene) + NoLegend()
## Warning in DoHeatmap(merged, features = top10$gene): The following features
## were omitted as they were not found in the scale.data slot for the RNA assay:
## BC035947, A530053G22Rik, Sucnr1, Gm454, Gm13403, Gm12688, Gm10475, Duox2,
## Arid3c, Ugt2b1, Gm26777, Skint7, Gm32772, Slc9a4, Mroh3, Zfp872, Elfn1, Dsp,
## Uchl1, Krt8, Krt5, Gm36535, Gm24474, Gm34639, Baiap2l2, 5830418P13Rik, Gm44737,
## Vwa5b2, Gm12295, Zp3r, Antxrl, B230312C02Rik, 4933413L06Rik, Ckm, Hspb6, Flnc,
## Myl9, Synm

visualize specific genes

FeaturePlot(merged, features = c("Upk3a", "Acta2", "Col1a1", "Cd3e", "Cd79a"))

DotPlot(merged, features = c("Upk3a", "Acta2", "Col1a1", "Cd3e", "Cd79a")) + RotatedAxis()

DotPlot(merged, features = list(
  Urothelial = c("Upk3a", "Krt20", "Krt8"),
  SmoothMuscle = c("Acta2", "Tagln", "Myh11"),
  Fibroblast = c("Col1a1", "Dcn", "Pdgfra"),
  Tcell = c("Cd3e", "Cd3d", "Trac"),
  Bcell = c("Cd79a", "Ms4a1"),
  Macrophage = c("Lyz2", "Adgre1")
)) + RotatedAxis()
## Warning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.
## ℹ Please use the `rows` argument instead.
## ℹ The deprecated feature was likely used in the Seurat package.
##   Please report the issue at <https://github.com/satijalab/seurat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

VlnPlot(merged, features = c("Acta2", "Col1a1", "Cd3e"), group.by = "seurat_clusters", pt.size = 0.1)

Step-by-Step: Check for Batch Effects in Seurat

Plot UMAP by batch/sample/condition

# Color by sample identity
DimPlot(merged, group.by = "orig.ident", label = TRUE, repel = TRUE)

# Optional: split UMAP by sample or condition
DimPlot(merged, split.by = "orig.ident", group.by = "seurat_clusters")

DimPlot(merged, split.by = "condition", group.by = "seurat_clusters")

Visualize sample composition per cluster

This helps quantify how balanced each cluster is across samples:

# Table of cluster × sample counts
tbl <- table(merged$seurat_clusters, merged$orig.ident)

# Reorder samples: WT1, WT2, KO1, KO2
sample_order <- c("1WT", "2WT", "1KO", "2KO")  # adjust based on your actual sample names
# Reorder the table columns
tbl <- tbl[, sample_order]
# Colors for clusters — use a consistent vector across plot & legend
cluster_colors <- 1:nrow(tbl)  # or you can define manually, like c("black", "red", ...)
# Proportions per sample
tbl_prop <- prop.table(tbl, margin = 2)

# Plot: That means 4 colors (recycled for each group of bars)
par(mar = c(5, 4, 4, 8), xpd = TRUE)
barplot(tbl_prop, beside = TRUE, col = 1:4,
        main = "Sample composition per cluster",
        ylab = "Proportion")

# Legend
legend("topright", inset = c(-0.2, 0),
       legend = rownames(tbl),
       fill = cluster_colors,
       title = "Cluster",
       cex = 0.8)

# Normalize proportions per cluster
tbl_prop <- prop.table(tbl, margin = 2)

# Set layout so there's space on the right for the legend
par(mar = c(5, 4, 4, 8), xpd = TRUE)

# Barplot with room for the legend
barplot(tbl_prop,
        beside = TRUE,
        col = rainbow(nrow(tbl)),
        main = "Sample composition per cluster",
        ylab = "Proportion",
        las = 2)

# Add legend to the right
legend("topright", inset = c(-0.2, 0), legend = rownames(tbl),
       fill = rainbow(nrow(tbl)), title = "Cluster", cex = 0.8)

as.data.frame(tbl) %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(x = "Cluster", y = "Proportion", fill = "Sample") +
  theme_minimal()

save the rds which has been clustred and joinedLayer

saveRDS(merged, file = here("results", "layerJoined_clusted.rds"))